#include "opencl_source_map.hpp" 
namespace MNN { 
const char* buffer_convert_quant = 
"#ifdef MNN_SUPPORT_FP16\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
"__constant sampler_t SAMPLER=CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n"
"#define GLOBAL_SIZE_2_DIMS __private const int global_size_dim0,__private const int global_size_dim1,\n"
"#define DEAL_NON_UNIFORM_DIM2(input1, input2) "" if (input1 >= global_size_dim0 || input2 >= global_size_dim1) { "" return; "" }\n"
"#ifdef USE_LOW_BIT_WEIGHT_INT8\n"
"// convert kernel : from int8 buffer(oihw) to int8 image(oc/4 h w ,ic oc4)\n"
"__kernel void conv2d_filter_buffer_to_nc4hw4_buffer_int8(GLOBAL_SIZE_2_DIMS\n"
" __global const char *input_ptr,\n"
" __private const int output_channel,\n"
" __private const int2 kernel_shape,\n"
" __private const int ic_h_w_size,\n"
" __private const int height_width_size,\n"
" __global char *output) {\n"
" int image_width_idx=get_global_id(0); // ic\n"
" int image_height_idx=get_global_id(1); // oc/4 h w\n"
" DEAL_NON_UNIFORM_DIM2(image_width_idx,image_height_idx);\n"
" const int input_channel_4_idx=image_width_idx;\n"
" const int output_channel_4_idx=(image_height_idx/height_width_size)*4;\n"
" const int height_width_idx=image_height_idx % height_width_size;\n"
" const int buffer_height_idx=height_width_idx/kernel_shape.y;\n"
" const int buffer_width_idx=height_width_idx % kernel_shape.y;\n"
" const int buffer_offset=output_channel_4_idx*ic_h_w_size+input_channel_4_idx*height_width_size +\n"
" buffer_height_idx*kernel_shape.y+buffer_width_idx;\n"
" char4 output_values=0;\n"
" if (output_channel_4_idx<output_channel) {\n"
" const int remain_channel=output_channel-output_channel_4_idx;\n"
" if (remain_channel >= 4) {\n"
" int offset=buffer_offset;\n"
" output_values.x=(char)(*(input_ptr+offset));\n"
" offset=mad24(1,ic_h_w_size,offset);\n"
" output_values.y=(char)(*(input_ptr+offset));\n"
" offset += ic_h_w_size;\n"
" output_values.z=(char)(*(input_ptr+offset));\n"
" offset += ic_h_w_size;\n"
" output_values.w=(char)(*(input_ptr+offset));\n"
" } else if (remain_channel == 3) {\n"
" int offset=buffer_offset;\n"
" output_values.x=(char)(*(input_ptr+offset));\n"
" offset=mad24(1,ic_h_w_size,offset);\n"
" output_values.y=(char)(*(input_ptr+offset));\n"
" offset += ic_h_w_size;\n"
" output_values.z=(char)(*(input_ptr+offset));\n"
" } else if (remain_channel == 2) {\n"
" int offset=buffer_offset;\n"
" output_values.x=(char)(*(input_ptr+offset));\n"
" offset=mad24(1,ic_h_w_size,offset);\n"
" output_values.y=(char)(*(input_ptr+offset));\n"
" } else if (remain_channel == 1) {\n"
" int offset=buffer_offset;\n"
" output_values.x=(char)(*(input_ptr+offset));\n"
" }\n"
" }\n"
" const int out_offset=(image_width_idx*height_width_size*((output_channel+3)/4)+image_height_idx)*4;\n"
" vstore4(output_values,0,output+out_offset);\n"
"}\n"
"#endif\n"
"#ifdef USE_LOW_BIT_WEIGHT_INT4\n"
"// convert kernel : from int8 buffer(oihw) to int4 image(oc/4 h w ,ic oc4)\n"
"__kernel void conv2d_filter_buffer_to_nc4hw4_buffer_int4(GLOBAL_SIZE_2_DIMS\n"
" __global const uchar *input_ptr,\n"
" __private const int output_channel,\n"
" __private const int2 kernel_shape,\n"
" __private const int ic_h_w_size,\n"
" __private const int height_width_size,\n"
" __global uchar *output) {\n"
" int image_width_idx=get_global_id(0); // ic\n"
" int image_height_idx=get_global_id(1); // oc/4 h w\n"
" DEAL_NON_UNIFORM_DIM2(image_width_idx,image_height_idx);\n"
" const int input_channel_4_idx=image_width_idx;\n"
" const int output_channel_4_idx=(image_height_idx/height_width_size)*4;\n"
" const int height_width_idx=image_height_idx % height_width_size;\n"
" const int buffer_height_idx=height_width_idx/kernel_shape.y;\n"
" const int buffer_width_idx=height_width_idx % kernel_shape.y;\n"
" const int buffer_offset=output_channel_4_idx*ic_h_w_size+input_channel_4_idx*height_width_size+buffer_height_idx*kernel_shape.y+buffer_width_idx;\n"
" int index0=buffer_offset,index1=buffer_offset+ic_h_w_size,index2=buffer_offset+2*ic_h_w_size,index3=buffer_offset+3*ic_h_w_size;\n"
" uchar2 output_values_int4=(uchar2)(0,0);\n"
" uchar s0=input_ptr[index0/2];\n"
" uchar s1=output_channel_4_idx+1 >= output_channel ? 0 : input_ptr[index1/2];\n"
" uchar s2=output_channel_4_idx+1 >= output_channel ? 0 : input_ptr[index2/2];\n"
" uchar s3=output_channel_4_idx+1 >= output_channel ? 0 : input_ptr[index3/2];\n"
" output_values_int4.x=((index0 % 2) == 0 ? (s0 & 0xf0) : (s0 << 4)) | ((index1 % 2) == 0 ? (s1 >> 4) : (s1 & 0x0f));\n"
" output_values_int4.y=((index2 % 2) == 0 ? (s2 & 0xf0) : (s2 << 4)) | ((index3 % 2) == 0 ? (s3 >> 4) : (s3 & 0x0f));\n"
" const int out_offset=(image_width_idx*height_width_size*((output_channel+3)/4)+image_height_idx)*2;\n"
" vstore2(output_values_int4,0,output+out_offset);\n"
"}\n"
"#endif\n"
"__kernel void conv2d_1x1_weight_quant_image(GLOBAL_SIZE_2_DIMS\n"
"#ifdef USE_LOW_BIT_WEIGHT_INT4\n"
" __global const uchar *input_ptr,\n"
"#else\n"
" __global const char *input_ptr,\n"
"#endif\n"
" __write_only image2d_t output,\n"
" __private const int input_channel,\n"
" __private const int output_channel) {\n"
" int x=get_global_id(0); // ic/4\n"
" int y=get_global_id(1); // oc/8\n"
" DEAL_NON_UNIFORM_DIM2(x,y);\n"
" const int xin=x << 2;\n"
" const int yin=y << 3;\n"
"#ifdef USE_LOW_BIT_WEIGHT_INT4\n"
" uchar16 out=0;\n"
" uchar *out_ptr=(uchar*)&out;\n"
" for(int i=0; i<4; ++i){\n"
" int index0=yin*input_channel+xin+i;\n"
" int index1=(yin+1)*input_channel+xin+i;\n"
" int index2=(yin+2)*input_channel+xin+i;\n"
" int index3=(yin+3)*input_channel+xin+i;\n"
" int index4=(yin+4)*input_channel+xin+i;\n"
" int index5=(yin+5)*input_channel+xin+i;\n"
" int index6=(yin+6)*input_channel+xin+i;\n"
" int index7=(yin+7)*input_channel+xin+i;\n"
" uchar s0=input_ptr[index0/2];\n"
" uchar s1=input_ptr[index1/2];\n"
" uchar s2=input_ptr[index2/2];\n"
" uchar s3=input_ptr[index3/2];\n"
" uchar s4=input_ptr[index4/2];\n"
" uchar s5=input_ptr[index5/2];\n"
" uchar s6=input_ptr[index6/2];\n"
" uchar s7=input_ptr[index7/2];\n"
" out_ptr[i*4]=((index0 % 2) == 0 ? (s0 & 0xf0) : (s0 << 4)) | ((index1 % 2) == 0 ? (s1 >> 4) : (s1 & 0x0f));\n"
" out_ptr[i*4+1]=((index2 % 2) == 0 ? (s2 & 0xf0) : (s2 << 4)) | ((index3 % 2) == 0 ? (s3 >> 4) : (s3 & 0x0f));\n"
" out_ptr[i*4+2]=((index4 % 2) == 0 ? (s4 & 0xf0) : (s4 << 4)) | ((index5 % 2) == 0 ? (s5 >> 4) : (s5 & 0x0f));\n"
" out_ptr[i*4+3]=((index6 % 2) == 0 ? (s6 & 0xf0) : (s6 << 4)) | ((index7 % 2) == 0 ? (s7 >> 4) : (s7 & 0x0f));\n"
" }\n"
" write_imagei(output,(int2)(x,y),as_int4(out));\n"
"#else\n"
" const int inputOffset=yin*input_channel+xin;\n"
" char4 s0=vload4(0,input_ptr+inputOffset);\n"
" char4 s1=vload4(0,input_ptr+inputOffset+input_channel);\n"
" char4 s2=vload4(0,input_ptr+inputOffset+input_channel*2);\n"
" char4 s3=vload4(0,input_ptr+inputOffset+input_channel*3);\n"
" char4 s4=vload4(0,input_ptr+inputOffset+input_channel*4);\n"
" char4 s5=vload4(0,input_ptr+inputOffset+input_channel*5);\n"
" char4 s6=vload4(0,input_ptr+inputOffset+input_channel*6);\n"
" char4 s7=vload4(0,input_ptr+inputOffset+input_channel*7);\n"
" char16 out0=(char16)(s0.s0,s1.s0,s2.s0,s3.s0,s4.s0,s5.s0,s6.s0,s7.s0,s0.s1,s1.s1,s2.s1,s3.s1,s4.s1,s5.s1,s6.s1,s7.s1);\n"
" char16 out1=(char16)(s0.s2,s1.s2,s2.s2,s3.s2,s4.s2,s5.s2,s6.s2,s7.s2,s0.s3,s1.s3,s2.s3,s3.s3,s4.s3,s5.s3,s6.s3,s7.s3);\n"
" write_imagei(output,(int2)(x*2,y),as_int4(out0));\n"
" write_imagei(output,(int2)(x*2+1,y),as_int4(out1));\n"
"#endif\n"
"}\n"
"__kernel void conv2d_1x1_weight_quant_buffer(GLOBAL_SIZE_2_DIMS\n"
"#ifdef USE_LOW_BIT_WEIGHT_INT4\n"
" __global const uchar *input_ptr,\n"
"#else\n"
" __global const char *input_ptr,\n"
"#endif\n"
" __global char *output_ptr,\n"
" __private const int input_channel,\n"
" __private const int output_channel) {\n"
" int x=get_global_id(0); // ic/4\n"
" int y=get_global_id(1); // oc/8\n"
" DEAL_NON_UNIFORM_DIM2(x,y);\n"
" const int xin=x << 2;\n"
" const int yin=y << 3;\n"
" const int outputChannelC8=(output_channel+7) >> 3;\n"
" const int inputChannelC4=(input_channel+3) >> 2;\n"
"#ifdef USE_LOW_BIT_WEIGHT_INT4\n"
" uchar16 out=0;\n"
" uchar *out_ptr=(uchar*)&out;\n"
" for(int i=0; i<4; ++i){\n"
" int index0=yin*input_channel+xin+i;\n"
" int index1=(yin+1)*input_channel+xin+i;\n"
" int index2=(yin+2)*input_channel+xin+i;\n"
" int index3=(yin+3)*input_channel+xin+i;\n"
" int index4=(yin+4)*input_channel+xin+i;\n"
" int index5=(yin+5)*input_channel+xin+i;\n"
" int index6=(yin+6)*input_channel+xin+i;\n"
" int index7=(yin+7)*input_channel+xin+i;\n"
" uchar s0=input_ptr[index0/2];\n"
" uchar s1=input_ptr[index1/2];\n"
" uchar s2=input_ptr[index2/2];\n"
" uchar s3=input_ptr[index3/2];\n"
" uchar s4=input_ptr[index4/2];\n"
" uchar s5=input_ptr[index5/2];\n"
" uchar s6=input_ptr[index6/2];\n"
" uchar s7=input_ptr[index7/2];\n"
" out_ptr[i*4]=((index0 % 2) == 0 ? (s0 & 0xf0) : (s0 << 4)) | ((index1 % 2) == 0 ? (s1 >> 4) : (s1 & 0x0f));\n"
" out_ptr[i*4+1]=((index2 % 2) == 0 ? (s2 & 0xf0) : (s2 << 4)) | ((index3 % 2) == 0 ? (s3 >> 4) : (s3 & 0x0f));\n"
" out_ptr[i*4+2]=((index4 % 2) == 0 ? (s4 & 0xf0) : (s4 << 4)) | ((index5 % 2) == 0 ? (s5 >> 4) : (s5 & 0x0f));\n"
" out_ptr[i*4+3]=((index6 % 2) == 0 ? (s6 & 0xf0) : (s6 << 4)) | ((index7 % 2) == 0 ? (s7 >> 4) : (s7 & 0x0f));\n"
" }\n"
" const int outputOffset=(y*inputChannelC4+x)*16;\n"
" vstore16(as_char16(out),0,output_ptr+outputOffset);\n"
"#else\n"
" const int inputOffset=yin*input_channel+xin;\n"
" char4 s0=vload4(0,input_ptr+inputOffset);\n"
" char4 s1=vload4(0,input_ptr+inputOffset+input_channel);\n"
" char4 s2=vload4(0,input_ptr+inputOffset+input_channel*2);\n"
" char4 s3=vload4(0,input_ptr+inputOffset+input_channel*3);\n"
" char4 s4=vload4(0,input_ptr+inputOffset+input_channel*4);\n"
" char4 s5=vload4(0,input_ptr+inputOffset+input_channel*5);\n"
" char4 s6=vload4(0,input_ptr+inputOffset+input_channel*6);\n"
" char4 s7=vload4(0,input_ptr+inputOffset+input_channel*7);\n"
" char16 out0=(char16)(s0.s0,s1.s0,s2.s0,s3.s0,s4.s0,s5.s0,s6.s0,s7.s0,s0.s1,s1.s1,s2.s1,s3.s1,s4.s1,s5.s1,s6.s1,s7.s1);\n"
" char16 out1=(char16)(s0.s2,s1.s2,s2.s2,s3.s2,s4.s2,s5.s2,s6.s2,s7.s2,s0.s3,s1.s3,s2.s3,s3.s3,s4.s3,s5.s3,s6.s3,s7.s3);\n"
" const int outputOffset=(y*inputChannelC4+x)*8*4;\n"
" vstore16(out0,0,output_ptr+outputOffset);\n"
" vstore16(out1,0,output_ptr+outputOffset+16);\n"
"#endif\n"
"}\n"
"__kernel void conv2d_1x1_ic_oc_weight_quant_buffer(GLOBAL_SIZE_2_DIMS\n"
"#ifdef USE_LOW_BIT_WEIGHT_INT4\n"
" __global const uchar *input_ptr,\n"
" __global uchar *output_ptr,//(Ci/packCin， Co/packCout,packCin， packCout)\n"
"#else\n"
" __global const char *input_ptr,\n"
" __global char *output_ptr,//(Ci/packCin， Co/packCout,packCin， packCout)\n"
"#endif\n"
" __private const int input_channel,\n"
" __private const int output_channel,\n"
" __private const int icPack,\n"
" __private const int ocPack) {\n"
" int x=get_global_id(0); // ic/icPack\n"
" int y=get_global_id(1); // oc/ocPack\n"
" DEAL_NON_UNIFORM_DIM2(x,y);\n"
" const int xin=x*icPack;\n"
" const int yin=y*ocPack;\n"
" const int inputChannelC4=(input_channel+icPack-1)/icPack;\n"
" const int outputChannelC4=(output_channel+ocPack-1)/ocPack;\n"
"#ifdef USE_LOW_BIT_WEIGHT_INT4\n"
" const int inputOffset=(yin*input_channel+xin)/2;\n"
" const int outputOffset=((x*outputChannelC4+y)*icPack*ocPack)/2;\n"
" for(int i=0; i<icPack; ++i){\n"
" for(int j=0; j<ocPack/2; ++j){\n"
" int index0=(yin+j*2)*input_channel+xin+i;\n"
" int index1=(yin+j*2+1)*input_channel+xin+i;\n"
" uchar s0=input_ptr[index0/2];\n"
" uchar s1=input_ptr[index1/2];\n"
" s0=(index0 % 2) == 0 ? (s0 & 0xf0) : ((s0 & 0x0f) << 4);\n"
" s1=(index1 % 2) == 0 ? (s1 >> 4) : (s1 & 0x0f);\n"
" output_ptr[outputOffset+i*(ocPack/2)+j]=s0 | s1;\n"
" }\n"
" }\n"
"#else\n"
" const int inputOffset=yin*input_channel+xin;\n"
" const int outputOffset=(x*outputChannelC4+y)*icPack*ocPack;\n"
" for(int i=0; i<icPack; ++i){\n"
" for(int j=0; j<ocPack; ++j){\n"
" output_ptr[outputOffset+i*ocPack+j]=input_ptr[inputOffset+j*input_channel+i];\n"
" }\n"
" }\n"
"#endif\n"
"}\n"
;
}
